var bibbase_data = {"data":"\"Loading..\"\n\n
\n\n \n\n \n\n \n \n\n \n\n \n \n\n \n\n \n
\n generated by\n \n \"bibbase.org\"\n\n \n
\n \n\n
\n\n \n\n\n
\n\n Excellent! Next you can\n create a new website with this list, or\n embed it in an existing web page by copying & pasting\n any of the following snippets.\n\n
\n JavaScript\n (easiest)\n
\n \n <script src=\"https://bibbase.org/show?bib=https://largo.lip6.fr/~cassagnea/data/Cassagne.bib&jsonp=1&group0=type&theme=side&owner=Cassagne,%20A&titleLinks=true&jsonp=1\"></script>\n \n
\n\n PHP\n
\n \n <?php\n $contents = file_get_contents(\"https://bibbase.org/show?bib=https://largo.lip6.fr/~cassagnea/data/Cassagne.bib&jsonp=1&group0=type&theme=side&owner=Cassagne,%20A&titleLinks=true\");\n print_r($contents);\n ?>\n \n
\n\n iFrame\n (not recommended)\n
\n \n <iframe src=\"https://bibbase.org/show?bib=https://largo.lip6.fr/~cassagnea/data/Cassagne.bib&jsonp=1&group0=type&theme=side&owner=Cassagne,%20A&titleLinks=true\"></iframe>\n \n
\n\n

\n For more details see the documention.\n

\n
\n
\n\n
\n\n This is a preview! To use this list on your own web site\n or create a new web site from it,\n create a free account. The file will be added\n and you will be able to edit it in the File Manager.\n We will show you instructions once you've created your account.\n
\n\n
\n\n

To the site owner:

\n\n

Action required! Mendeley is changing its\n API. In order to keep using Mendeley with BibBase past April\n 14th, you need to:\n

    \n
  1. renew the authorization for BibBase on Mendeley, and
  2. \n
  3. update the BibBase URL\n in your page the same way you did when you initially set up\n this page.\n
  4. \n
\n

\n\n

\n \n \n Fix it now\n

\n
\n\n
\n\n\n
\n \n \n
\n
\n  \n article\n \n \n (5)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n A DSEL for High Throughput and Low Latency Software-Defined Radio on Multicore CPUs.\n \n \n\n\n \n Cassagne, A.; Tajan, R.; Aumage, O.; Barthou, D.; Leroux, C.; and Jégo, C.\n\n\n \n\n\n\n Wiley Concurrency and Computation: Practice and Experience (CCPE), 35(23): e7820. July 2023.\n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n \n \"A link\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@Article{Cassagne2023,\n  author      = {A. Cassagne and R. Tajan and O. Aumage and D. Barthou and C. Leroux and C. J\\'ego},\n  journal     = {Wiley Concurrency and Computation: Practice and Experience (CCPE)},\n  title       = {A {DSEL} for High Throughput and Low Latency Software-Defined Radio on Multicore {CPU}s},\n  year        = {2023},\n  month       = jul,\n  number      = {23},\n  pages       = {e7820},\n  volume      = {35},\n  abstract    = {This article presents a new Domain Specific Embedded Language (DSEL) dedicated to Software-Defined Radio (SDR). From a set of carefully designed components, it enables to build efficient software digital communication systems, able to take advan- tage of the parallelism of modern processor architectures, in a straightforward and safe manner for the programmer. In particular, proposed DSEL enables the combination of pipelining and sequence duplication techniques to extract both temporal and spatial parallelism from digital communication systems. We leverage the DSEL capabilities on a real use case: a fully digital transceiver for the widely used DVB-S2 standard designed entirely in software. Through evaluation, we show how proposed software DVB-S2 transceiver is able to get the most from modern, high-end multi- core CPU targets.},\n  doi         = {10.1002/cpe.7820},\n  hal_id      = {hal-04156404},\n  hal_version = {v3},\n  keywords    = {DSEL, SDR, Multicore CPUs, Pipeline, Real-time system, DVB-S2 transceiver},\n  url_Paper   = {https://hal.science/hal-04156404v3/file/Cassagne2023%20-%20A%20DSEL%20for%20High%20Throughput%20and%20Low%20Latency%20Software-Defined%20Radio%20on%20Multicore%20CPUs%20%5Bpreprint%5D.pdf},\n  url_Link    = {https://onlinelibrary.wiley.com/doi/10.1002/cpe.7820},\n}\n\n
\n
\n\n\n
\n This article presents a new Domain Specific Embedded Language (DSEL) dedicated to Software-Defined Radio (SDR). From a set of carefully designed components, it enables to build efficient software digital communication systems, able to take advan- tage of the parallelism of modern processor architectures, in a straightforward and safe manner for the programmer. In particular, proposed DSEL enables the combination of pipelining and sequence duplication techniques to extract both temporal and spatial parallelism from digital communication systems. We leverage the DSEL capabilities on a real use case: a fully digital transceiver for the widely used DVB-S2 standard designed entirely in software. Through evaluation, we show how proposed software DVB-S2 transceiver is able to get the most from modern, high-end multi- core CPU targets.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n A 2022 τ-Herculid Meteor Cluster from an Airborne Experiment: Automated Detection, Characterization, and Consequences for Meteoroids.\n \n \n\n\n \n Vaubaillon, J.; Loir, C.; Ciocan, C.; Kandeepan, M.; Millet, M.; Cassagne, A.; Lacassagne, L.; da Fonseca, P.; Zander, F.; Buttsworth, D.; Loehle, S.; Tóth, J.; Gray, S.; Moingeon, A.; and Rambaux, N.\n\n\n \n\n\n\n Astronomy and Astrophysics (A&A). February 2023.\n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n \n \"A link\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 5 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@Article{Vaubaillon2023,\n  author      = {Vaubaillon, J{\\'e}r{\\'e}mie and Loir, Charlotte and Ciocan, Clara and Kandeepan, Mathuran and Millet, Maxime and Cassagne, Adrien and Lacassagne, Lionel and da Fonseca, Pedro and Zander, Fabian and Buttsworth, David and Loehle, Stefan and T{\\'o}th, Juraj and Gray, Scott and Moingeon, Audrey and Rambaux, Nicolas},\n  journal     = {Astronomy and Astrophysics (A\\&A)},\n  title       = {A 2022 \\tau-Herculid Meteor Cluster from an Airborne Experiment: Automated Detection, Characterization, and Consequences for Meteoroids},\n  year        = {2023},\n  month       = feb,\n  abstract    = {Context. The existence of meteor clusters has long since been a subject of speculation and so far only seven events have been reported, among which two involve less than five meteors, and three were seen during the Leonid storms.\nAims. The 1995 outburst of Comet 73P/Schwassmann-Wachmann was predicted to result in a meteor shower in May 2022. We detected the shower, proved this to be the result of this outburst, and detected another meteor cluster during the same observation mission.\nMethods. The tau-Herculids meteor shower outburst on 31 May 2022 was continuously monitored for 4 hours during an airborne campaign. The video data were analyzed using a recently developed computer-vision processing chain for meteor real-time detection. Results. We report and characterize the detection of a meteor cluster involving 38 fragments, detected at 06:48 UT for a total duration of 11.3 s. The derived cumulative size frequency distribution index is relatively shallow: s = 3.1. Our open-source computer-vision processing chain (named FMDT) detects 100\\% of the meteors that a human eye is able to detect in the video. Classical automated motion detection assuming a static camera was not suitable for the stabilized camera setup because of residual motion.\nConclusions. 
From all reported meteor clusters, we crudely estimate their occurrence to be less than one per million observed meteors. Low heliocentric distance enhances the probability of such meteoroid self-disruption in the interplanetary space.},\n  doi         = {10.1051/0004-6361/202244993},\n  hal_id      = {hal-03956836},\n  hal_version = {v1},\n  keywords    = {meteoroids, Comets: general, method: data analysis, meteoroids},\n  publisher   = {{EDP Sciences}},\n  url_Paper   = {https://hal.science/hal-03956836/file/article.pdf},\n  url_Link    = {https://www.aanda.org/articles/aa/full_html/2023/02/aa44993-22/aa44993-22.html},\n}\n\n
\n
\n\n\n
\n Context. The existence of meteor clusters has long since been a subject of speculation and so far only seven events have been reported, among which two involve less than five meteors, and three were seen during the Leonid storms. Aims. The 1995 outburst of Comet 73P/Schwassmann-Wachmann was predicted to result in a meteor shower in May 2022. We detected the shower, proved this to be the result of this outburst, and detected another meteor cluster during the same observation mission. Methods. The tau-Herculids meteor shower outburst on 31 May 2022 was continuously monitored for 4 hours during an airborne campaign. The video data were analyzed using a recently developed computer-vision processing chain for meteor real-time detection. Results. We report and characterize the detection of a meteor cluster involving 38 fragments, detected at 06:48 UT for a total duration of 11.3 s. The derived cumulative size frequency distribution index is relatively shallow: s = 3.1. Our open-source computer-vision processing chain (named FMDT) detects 100% of the meteors that a human eye is able to detect in the video. Classical automated motion detection assuming a static camera was not suitable for the stabilized camera setup because of residual motion. Conclusions. From all reported meteor clusters, we crudely estimate their occurrence to be less than one per million observed meteors. Low heliocentric distance enhances the probability of such meteoroid self-disruption in the interplanetary space.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n AFF3CT: A Fast Forward Error Correction Toolbox!.\n \n \n\n\n \n Cassagne, A.; Hartmann, O.; Léonardon, M.; He, K.; Leroux, C.; Tajan, R.; Aumage, O.; Barthou, D.; Tonnellier, T.; Pignoly, V.; Le Gal, B.; and Jégo, C.\n\n\n \n\n\n\n Elsevier SoftwareX, 10: 100345. October 2019.\n \n\n\n\n
\n\n\n\n \n \n \"AFF3CT: paper\n  \n \n \n \"AFF3CT: link\n  \n \n \n \"AFF3CT: slides\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@Article{Cassagne2019a,\n  author     = {A. Cassagne and O. Hartmann and M. L\\'eonardon and K. He and C. Leroux and R. Tajan and O. Aumage and D. Barthou and T. Tonnellier and V. Pignoly and B. {Le Gal} and C. J\\'ego},\n  title      = {{AFF3CT}: A Fast Forward Error Correction Toolbox!},\n  journal    = {Elsevier SoftwareX},\n  year       = {2019},\n  volume     = {10},\n  pages      = {100345},\n  month      = oct,\n  issn       = {2352-7110},\n  abstract   = {AFF3CT is an open source toolbox dedicated to Forward Error Correction (FEC or channel coding). It supports a broad range of codes: from widespread turbo codes and Low-Density Parity-Check (LDPC) codes to more recent polar codes. The toolbox is written in C++ and can be used either as a simulator to quickly evaluate algorithms characteristics, or as a library in Software Defined Radio (SDR) systems or for other specific needs. Most of the decoding algorithm implementations aim at low latency and high throughput, targeting multiple Gb/s on modern CPUs. This is crucial in both simulation and SDR use cases: Monte Carlo simulations require high performance implementation as they commonly target the estimation of approximately 10^{12} bits. On the other hand, the implementations in real systems have to be very efficient to be competitive against dedicated hardware ones. 
Finally, AFF3CT emphasizes the reproducibility of state-of-the-art results by providing public references and open, modular source code.},\n  doi        = {10.1016/j.softx.2019.100345},\n  keywords   = {Communication chain, Channel coding, Monte Carlo simulation, Forward error correction library, Digital modulation, Reproducible science, Multi-node, Multi-thread, Vectorization},\n  url_Paper  = {https://inria.hal.science/hal-02358306v1/file/Cassagne2019a%20-%20AFF3CT%3A%20A%20Fast%20Forward%20Error%20Correction%20Toolbox.pdf},\n  url_Link   = {https://www.sciencedirect.com/science/article/pii/S2352711019300457},\n  url_Slides = {https://www.researchgate.net/profile/Adrien-Cassagne/publication/334030314_AFF3CT_A_Fast_Forward_Error_Correction_Toolbox/links/5d133e2fa6fdcc2462a685b5/AFF3CT-A-Fast-Forward-Error-Correction-Toolbox.pdf?origin=publicationDetail&_sg%5B0%5D=j_FLCBMcq46Lq_JgDLiiWnc5KYHuOgTaDZWAZbc_XoIcMP1BaZfUzLfgjn3wBm5q4oqowGNJHFz4FSHwciW91g.oVw1RgX5dD1DlgoaH9ygbYwO0lGImgg60wtoNVfkuhk9kWxljq1MqHuaVa4SCQ2k71arShRp3gy0TNsclhWsqQ&_sg%5B1%5D=pDjfzMu22Ta3fwX7qX9a5xFK3_2MMhjHWd6ZED6wzue-filk8V6BM-tAurkmYmod6bLUgUgQPEjdkGGs2QLEJDCdk7EHN-4rNpAg9HXIc-1M.oVw1RgX5dD1DlgoaH9ygbYwO0lGImgg60wtoNVfkuhk9kWxljq1MqHuaVa4SCQ2k71arShRp3gy0TNsclhWsqQ&_sg%5B2%5D=_Y6fY6bhyIX2Gmk7jVFwqGRxKS1iki-l0_spQiD_GZsQsWSDKNX5Uxf3yqX2qCDGltHvsk_ZYfkNwJo.sAmj_v1sTluoJHTo0snCLQ-15uD6YzVplIadIIelvHuYzybPY-43hWhLyvuf6VS2rlQj4YOwyHrVi60qe48jRA&_iepl=&_rtd=eyJjb250ZW50SW50ZW50IjoibWFpbkl0ZW0ifQ%3D%3D&_tp=eyJjb250ZXh0Ijp7ImZpcnN0UGFnZSI6Il9kaXJlY3QiLCJwYWdlIjoicHVibGljYXRpb24iLCJwcmV2aW91c1BhZ2UiOiJwcm9maWxlIiwicG9zaXRpb24iOiJwYWdlSGVhZGVyIn19},\n}\n\n
\n
\n\n\n
\n AFF3CT is an open source toolbox dedicated to Forward Error Correction (FEC or channel coding). It supports a broad range of codes: from widespread turbo codes and Low-Density Parity-Check (LDPC) codes to more recent polar codes. The toolbox is written in C++ and can be used either as a simulator to quickly evaluate algorithms characteristics, or as a library in Software Defined Radio (SDR) systems or for other specific needs. Most of the decoding algorithm implementations aim at low latency and high throughput, targeting multiple Gb/s on modern CPUs. This is crucial in both simulation and SDR use cases: Monte Carlo simulations require high performance implementation as they commonly target the estimation of approximately 10^12 bits. On the other hand, the implementations in real systems have to be very efficient to be competitive against dedicated hardware ones. Finally, AFF3CT emphasizes the reproducibility of state-of-the-art results by providing public references and open, modular source code.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n Toward High Performance Implementation of 5G SCMA Algorithms.\n \n \n\n\n \n Ghaffari, A.; Léonardon, M.; Cassagne, A.; Leroux, C.; and Savaria, Y.\n\n\n \n\n\n\n IEEE Access, 7: 10402–10414. January 2019.\n \n\n\n\n
\n\n\n\n \n \n \"Toward paper\n  \n \n \n \"Toward link\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@Article{Ghaffari2019,\n  author    = {A. Ghaffari and M. L\\'eonardon and A. Cassagne and C. Leroux and Y. Savaria},\n  journal   = {IEEE Access},\n  title     = {Toward High Performance Implementation of {5G} {SCMA} Algorithms},\n  year      = {2019},\n  issn      = {2169-3536},\n  pages     = {10402--10414},\n  month     = jan,\n  volume    = {7},\n  abstract  = {The recent evolution of mobile communication systems toward a 5G network is associated with the search for new types of non-orthogonal modulations such as Sparse Code Multiple Access (SCMA). Such modulations are proposed in response to demands for increasing the number of connected users. SCMA is a non-orthogonal multiple access technique that offers improved Bit-Error Rate (BER) performance and higher spectral efficiency than other comparable techniques, but these improvements come at the cost of complex decoders. There are many challenges in designing near-optimum high throughput SCMA decoders. This paper explores means to enhance the performance of SCMA decoders. To achieve this goal, in this paper, various improvements to the MPA algorithms are proposed. They notably aim at adapting SCMA decoding to the Single Instruction Multiple Data (SIMD) paradigm. An approximate modeling of noise is performed to reduce the complexity of floating point calculations. The effects of forward error corrections such as Polar codes, Turbo codes and LDPC, as well as different ways of accessing memory and improving power efficiency of  modified MPAs are investigated. 
The results show that the throughput of a SCMA decoder can be increased by 3.1 to 21 times when compared to the original MPA on different computing platforms using the suggested improvements.},\n  doi       = {10.1109/ACCESS.2019.2891597},\n  keywords  = {5G, SCMA, Maximum Likelihood (ML), Message Passing Algorithm (MPA), log-MPA, iterative multi-user detection, BER, Single Instruction Multiple Data (SIMD), Intel Advanced Vector Extensions (AVX), Streaming SIMD Extension (SSE), Knights Corner Instruction (KNCI), power efficiency, exponential estimations},\n  url_Paper = {https://hal.science/hal-01977885v1/file/Ghaffari2019%20-%20Toward%20High%20Performance%20Implementation%20of%205G%20SCMA%20Algorithms.pdf},\n  url_Link  = {https://ieeexplore.ieee.org/document/8606081},\n}\n\n
\n
\n\n\n
\n The recent evolution of mobile communication systems toward a 5G network is associated with the search for new types of non-orthogonal modulations such as Sparse Code Multiple Access (SCMA). Such modulations are proposed in response to demands for increasing the number of connected users. SCMA is a non-orthogonal multiple access technique that offers improved Bit-Error Rate (BER) performance and higher spectral efficiency than other comparable techniques, but these improvements come at the cost of complex decoders. There are many challenges in designing near-optimum high throughput SCMA decoders. This paper explores means to enhance the performance of SCMA decoders. To achieve this goal, in this paper, various improvements to the MPA algorithms are proposed. They notably aim at adapting SCMA decoding to the Single Instruction Multiple Data (SIMD) paradigm. An approximate modeling of noise is performed to reduce the complexity of floating point calculations. The effects of forward error corrections such as Polar codes, Turbo codes and LDPC, as well as different ways of accessing memory and improving power efficiency of modified MPAs are investigated. The results show that the throughput of a SCMA decoder can be increased by 3.1 to 21 times when compared to the original MPA on different computing platforms using the suggested improvements.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n Fast and Flexible Software Polar List Decoders.\n \n \n\n\n \n Léonardon, M.; Cassagne, A.; Leroux, C.; Jégo, C.; Hamelin, L.; and Savaria, Y.\n\n\n \n\n\n\n Springer Journal of Signal Processing Systems (JSPS), 91: 937–952. January 2019.\n \n\n\n\n
\n\n\n\n \n \n \"Fast paper\n  \n \n \n \"Fast link\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@Article{Leonardon2019,\n  author    = {M. L\\'eonardon and A. Cassagne and C. Leroux and C. J\\'ego and L-P. Hamelin and Y. Savaria},\n  title     = {Fast and Flexible Software Polar List Decoders},\n  journal   = {Springer Journal of Signal Processing Systems (JSPS)},\n  year      = {2019},\n  volume    = {91},\n  pages     = {937--952},\n  month     = jan,\n  issn      = {1939-8115},\n  abstract  = {Flexibility is one mandatory aspect of channel coding in modern wireless communication systems. Among other things, the channel decoder has to support several code lengths and code rates. This need for flexibility applies to polar codes that are considered for control channels in the future 5G standard. This paper presents a new generic and flexible implementation of a software Successive Cancellation List (SCL) decoder. A large set of parameters can be fine-tuned dynamically without re-compiling the software source code: the code length, the code rate, the frozen bits set, the puncturing patterns, the cyclic redundancy check, the list size, the type of decoding algorithm, the tree-pruning strategy and the data quantization. This generic and flexible SCL decoder enables to explore tradeoffs between throughput, latency and decoding performance. Several optimizations are proposed to achieve a competitive decoding speed despite the constraints induced by the genericity and the flexibility. The resulting polar list decoder is about 4 times faster than a generic software decoder and only 2 times slower than a non-flexible unrolled decoder. 
Thanks to the flexibility of the decoder, the fully adaptive SCL algorithm can be easily implemented and achieves higher throughput than any other similar decoder in the literature (up to 425 Mb/s on a single processor core for N = 2048 and K = 1723 at 4.5 dB).},\n  day       = {18},\n  doi       = {10.1007/s11265-018-1430-3},\n  keywords  = {Polar codes, Adaptive successive cancellation list decoder, Software implementation, 5G standard, Generic decoder, Flexible decoder},\n  url_Paper = {https://inria.hal.science/hal-01987848v1/file/Leonardon2017%20-%20Fast%20and%20Flexible%20Software%20Polar%20List%20Decoders.pdf},\n  url_Link  = {https://link.springer.com/article/10.1007/s11265-018-1430-3},\n}\n\n
\n
\n\n\n
\n Flexibility is one mandatory aspect of channel coding in modern wireless communication systems. Among other things, the channel decoder has to support several code lengths and code rates. This need for flexibility applies to polar codes that are considered for control channels in the future 5G standard. This paper presents a new generic and flexible implementation of a software Successive Cancellation List (SCL) decoder. A large set of parameters can be fine-tuned dynamically without re-compiling the software source code: the code length, the code rate, the frozen bits set, the puncturing patterns, the cyclic redundancy check, the list size, the type of decoding algorithm, the tree-pruning strategy and the data quantization. This generic and flexible SCL decoder enables to explore tradeoffs between throughput, latency and decoding performance. Several optimizations are proposed to achieve a competitive decoding speed despite the constraints induced by the genericity and the flexibility. The resulting polar list decoder is about 4 times faster than a generic software decoder and only 2 times slower than a non-flexible unrolled decoder. Thanks to the flexibility of the decoder, the fully adaptive SCL algorithm can be easily implemented and achieves higher throughput than any other similar decoder in the literature (up to 425 Mb/s on a single processor core for N = 2048 and K = 1723 at 4.5 dB).\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n conference\n \n \n (8)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n A New Meteor Detection Application Robust to Camera Movements.\n \n \n\n\n \n Ciocan, C.; Kandeepan, M.; Cassagne, A.; Vaubaillon, J.; Zander, F.; and Lacassagne, L.\n\n\n \n\n\n\n August 2023.\n Groupe de Recherche et d'Études de Traitement du Signal et des Images (GRETSI). In french.\n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n \n \"A link\n  \n \n \n \"A slides\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@Conference{Ciocan2023,\n  author      = {Ciocan, Clara and Kandeepan, Mathuran and Cassagne, Adrien and Vaubaillon, J{\\'e}r{\\'e}mie and Zander, Fabian and Lacassagne, Lionel},\n  booktitle   = {Groupe de Recherche et d'{\\'E}tudes de Traitement du Signal et des Images (GRETSI)},\n  title       = {A New Meteor Detection Application Robust to Camera Movements},\n  year        = {2023},\n  address     = {Grenoble, France},\n  month       = aug,\n  note        = {Groupe de Recherche et d'{\\'E}tudes de Traitement du Signal et des Images (GRETSI). In french.},\n  abstract    = {This article presents a new tool for the automatic detection of meteors. Fast Meteor Detection Toolbox (FMDT) is able to detect meteor sightings by analyzing videos acquired by cameras onboard weather balloons or within airplane with stabilization. The challenge consists in designing a processing chain composed of simple algorithms, that are robust to the high fluctuation of the videos and that satisfy the constraints on power consumption (10 W) and real-time processing (25 frames per second).},\n  doi         = {10.48550/arXiv.2309.06027},\n  hal_id      = {hal-04198536},\n  hal_version = {v1},\n  url_Paper   = {https://gretsi.fr/data/colloque/pdf/2023_ciocan1190.pdf},\n  url_Link    = {https://hal.science/hal-04198536},\n  url_Slides  = {https://hal.science/hal-04198536v1/file/Ciocan2023%20-%20Une%20nouvelle%20application%20de%20detection%20de%20meteores%20robuste%20aux%20mouvements%20de%20camera%20%5Bposter%5D.pdf},\n}\n\n
\n
\n\n\n
\n This article presents a new tool for the automatic detection of meteors. Fast Meteor Detection Toolbox (FMDT) is able to detect meteor sightings by analyzing videos acquired by cameras onboard weather balloons or within airplane with stabilization. The challenge consists in designing a processing chain composed of simple algorithms, that are robust to the high fluctuation of the videos and that satisfy the constraints on power consumption (10 W) and real-time processing (25 frames per second).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n Parallelization of a New Embedded Application for Automatic Meteor Detection.\n \n \n\n\n \n Kandeepan, M.; Ciocan, C.; Cassagne, A.; and Lacassagne, L.\n\n\n \n\n\n\n July 2023.\n Conférence d'informatique en Parallélisme, Architecture et Système (COMPAS). In french.\n\n\n\n
\n\n\n\n \n \n \"Parallelization paper\n  \n \n \n \"Parallelization link\n  \n \n \n \"Parallelization slides\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@Conference{Kandeepan2023,\n  author     = {Kandeepan, Mathuran and Ciocan, Clara and Cassagne, Adrien and Lacassagne, Lionel},\n  booktitle  = {Conf{\\'e}rence d'informatique en Parall{\\'e}lisme, Architecture et Syst{\\`e}me (COMPAS)},\n  title      = {Parallelization of a New Embedded Application for Automatic Meteor Detection},\n  year       = {2023},\n  address    = {Annecy, France},\n  month      = jul,\n  note       = {Conf{\\'e}rence d'informatique en Parall{\\'e}lisme, Architecture et Syst{\\`e}me (COMPAS). In french.},\n  abstract   = {This article presents the methods used to parallelize a new computer vision application. The system is able to automatically detect meteor from non-stabilized cameras and noisy video sequences. The application is designed to be embedded in weather balloons or for airborne observation campaigns. Thus, the final target is a low power system-on-chip (< 10 Watts) while the software needs to compute a stream of frames in real-time (> 25 frames per second). For this, first the application is split in a tasks graph, then different parallelization techniques are applied. Experiment results demonstrate the efficiency of the parallelization methods. For instance, on the Raspberry Pi 4 and on a HD video sequence, the processing chain reaches 42 frames per second while it only consumes 6 Watts.},\n  doi        = {10.48550/arXiv.2307.10632},\n  url_Paper  = {https://hal.science/hal-04164359v1/file/article.pdf},\n  url_Link   = {https://hal.science/hal-04164359v1},\n  url_Slides = {https://hal.science/hal-04164359v1/file/Kandeepan2023%20-%20Parallelisation%20dune%20nouvelle%20application%20embarquee%20pour%20la%20detection%20automatique%20de%20meteores%20%5Bslides%5D.pdf},\n}\n\n
\n
\n\n\n
\n This article presents the methods used to parallelize a new computer vision application. The system is able to automatically detect meteor from non-stabilized cameras and noisy video sequences. The application is designed to be embedded in weather balloons or for airborne observation campaigns. Thus, the final target is a low power system-on-chip (< 10 Watts) while the software needs to compute a stream of frames in real-time (> 25 frames per second). For this, first the application is split in a tasks graph, then different parallelization techniques are applied. Experiment results demonstrate the efficiency of the parallelization methods. For instance, on the Raspberry Pi 4 and on a HD video sequence, the processing chain reaches 42 frames per second while it only consumes 6 Watts.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n A 2022 τ-Herculids meteor cluster.\n \n \n\n\n \n Vaubaillon, J.; Loir, C.; Millet, M.; Ciocan, C.; Kandeepan, M.; Cassagne, A.; Lacassagne, L.; da Fonseca, P.; Zander, F.; Buttsworth, D.; Loehle, S.; Tóth, J.; Gray-Owen, S. D; Moingeon, A.; and Rambaux, N.\n\n\n \n\n\n\n September 2022.\n Internationational Meteor Conference (IMC).\n\n\n\n
\n\n\n\n \n \n \"A link\n  \n \n \n \"A slides\n  \n \n\n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@Conference{Vaubaillon2022,\n  title        = {A 2022 \\tau-Herculids meteor cluster},\n  author       = {Vaubaillon, Jeremie and Loir, Charlotte and Millet, Maxime and Ciocan, Clara and Kandeepan, Mathuran and Cassagne, Adrien and Lacassagne, Lionel and da Fonseca, Pedro and Zander, Fabian and Buttsworth, David and Loehle, Stefan and T{\\'o}th, Juraj and Gray-Owen, Scott D and Moingeon, Audrey and Rambaux, Nicolas},\n  booktitle    = {Internationational Meteor Conference (IMC)},\n  note         = {Internationational Meteor Conference (IMC).},\n  address      = {Poroszlo, Hungary},\n  organization = {{Research Centre for Astronomy and Earth Sciences (CSFK) and Konkoly Thege Astronomical Institut}},\n  year         = {2022},\n  month        = Sep,\n  keywords     = {Meteor shower ; Meteor tracking ; Computer vision ; Realtime image processing},\n  abstract     = {Airborne observation (see J. Toth's talk) Mobile cameras from the MoMet device (see Da Fonseca's talk, IMC2021) 2 stabilized cameras + RBpi + RMS software 2 unstabilized cameras + continuous recording on Mac Basler acA1920-155um + 6mm f/1.4 or 12mm f/1.6 lens ; 20 fps },\n  hal_id       = {hal-03838128},\n  hal_version  = {v1},\n  url_Link     = {https://hal.science/hal-03838128},\n  url_Slides   = {https://hal.science/hal-03838128v1/file/2022-TAH%20Cluster.pdf},\n}\n\n
\n
\n\n\n
\n Airborne observation (see J. Toth's talk) Mobile cameras from the MoMet device (see Da Fonseca's talk, IMC2021) 2 stabilized cameras + RBpi + RMS software 2 unstabilized cameras + continuous recording on Mac Basler acA1920-155um + 6mm f/1.4 or 12mm f/1.6 lens ; 20 fps \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n Fast Meteor Detection Toolbox.\n \n \n\n\n \n Kandeepan, M.; Ciocan, C.; Millet, M.; Bouyer, M.; Cassagne, A.; and Lacassagne, L.\n\n\n \n\n\n\n November 2022.\n Journée AFF3CT. Poster.\n\n\n\n
\n\n\n\n \n \n \"Fast slides\n  \n \n \n \"Fast link\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@Conference{Kandeepan2022,\n  author       = {Kandeepan, Mathuran and Ciocan, Clara and Millet, Maxime and Bouyer, Manuel and Cassagne, Adrien and Lacassagne, Lionel},\n  booktitle    = {Journ{\\'e}e AFF3CT},\n  title        = {Fast Meteor Detection Toolbox},\n  year         = {2022},\n  month        = Nov,\n  note         = {Journ{\\'e}e AFF3CT. Poster.},\n  organization = {{Centre Inria de l'Universit{\\'e} de Bordeaux}},\n  abstract     = {Detection and characterization of meteoroids and space debris that enter into our atmosphere is one of the main concerns of astronomers. This work presents a computer vision application to detect meteors from video sequences. This application is designed to be embedded in satellites, in weather balloons or for airborne observation campaigns. The system runs on CPUs and is robust to non-stabilized cameras and noisy video sequences. It is evaluated on the 2022 Tau-Herculids video sequence where an overall tracking rate of 80.4\\% is obtained. All the visible meteors are detected. Experiment results demonstrate that the system runs efficiently with limited power thanks to the use of multithreading techniques (pipeline and fork-join parallelism). For example, on the 2022 Tau-Herculids HD video sequence, the system reaches 30 FPS on the Raspberry Pi 4 while the instant power is only 3.8 Watts.},\n  doi          = {10.13140/RG.2.2.12222.36161},\n  hal_id       = {hal-03954992},\n  hal_version  = {v1},\n  keywords     = {computer vision, Multithreading, meteor cluster, embedded system, pipeline parallelism, meteor tracking, tasks graph, data flow},\n  url_Slides   = {https://hal.science/hal-03954992/file/poster_fmdt.pdf},\n  url_Link     = {https://hal.science/hal-03954992},\n}\n\n
\n
\n\n\n
\n Detection and characterization of meteoroids and space debris that enter into our atmosphere is one of the main concerns of astronomers. This work presents a computer vision application to detect meteors from video sequences. This application is designed to be embedded in satellites, in weather balloons or for airborne observation campaigns. The system runs on CPUs and is robust to non-stabilized cameras and noisy video sequences. It is evaluated on the 2022 Tau-Herculids video sequence where an overall tracking rate of 80.4% is obtained. All the visible meteors are detected. Experiment results demonstrate that the system runs efficiently with limited power thanks to the use of multithreading techniques (pipeline and fork-join parallelism). For example, on the 2022 Tau-Herculids HD video sequence, the system reaches 30 FPS on the Raspberry Pi 4 while the instant power is only 3.8 Watts.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n Meteorix: High performance computer vision application for Meteor detection from a CubeSat.\n \n \n\n\n \n Millet, M.; Rambaux, N.; Cassagne, A.; Bouyer, M.; Petreto, A.; and Lacassagne, L.\n\n\n \n\n\n\n July 2022.\n Scientific Assembly of the Committee on Space Research (COSPAR).\n\n\n\n
\n\n\n\n \n \n \"Meteorix: link\n  \n \n \n \"Meteorix: slides\n  \n \n\n \n\n \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@Conference{Millet2022,\n  title       = {Meteorix: High performance computer vision application for Meteor detection from a CubeSat},\n  author      = {Millet, Maxime and Rambaux, Nicolas and Cassagne, Adrien and Bouyer, Manuel and Petreto, Andrea and Lacassagne, Lionel},\n  booktitle   = {Scientific Assembly of the Committee on Space Research (COSPAR)},\n  note        = {Scientific Assembly of the Committee on Space Research (COSPAR).},\n  address     = {Athens, Greece},\n  year        = {2022},\n  month       = Jul,\n  hal_id      = {hal-03737605},\n  hal_version = {v1},\n  url_Link    = {https://hal.science/hal-03737605},\n  url_Slides  = {https://hal.science/hal-03737605v1/file/COSPAR22.pdf},\n}\n\n
\n
\n\n\n\n
\n\n\n \n\n\n
\n \n\n \n \n \n \n AFF3CT : Un environnement de simulation pour le codage de canal.\n \n \n\n\n \n Cassagne, A.; Léonardon, M.; Hartmann, O.; Tonnellier, T.; Delbergue, G.; Giraud, V.; Leroux, C.; Tajan, R.; Le Gal, B.; Jégo, C.; Aumage, O.; and Barthou, D.\n\n\n \n\n\n\n June 2017.\n GdR SoC2. In french.\n\n\n\n
\n\n\n\n \n \n \"AFF3CT paper\n  \n \n \n \"AFF3CT link\n  \n \n \n \"AFF3CT slides\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@Conference{Cassagne2017,\n  author     = {A. Cassagne and M. L\\'eonardon and O. Hartmann and T. Tonnellier and G. Delbergue and V. Giraud and C. Leroux and R. Tajan and B. {Le Gal} and C. J\\'ego and O. Aumage and D. Barthou},\n  title      = {{AFF3CT} : Un environnement de simulation pour le codage de canal},\n  booktitle  = {GdR SoC2},\n  note       = {GdR SoC2. In french.},\n  year       = {2017},\n  month      = jun,\n  abstract   = {Dans cet article nous présentons un environnement de simulation de Monte Carlo pour les systèmes de communications numériques. Nous nous focalisons en particulier sur les fonctions associées au codage de\ncanal. Après avoir présenté les enjeux liés à la simulation, nous identifions trois problèmes inhérents à ce type de simulation. Puis nous présentons les principales caractéristiques de l’environnement AFF3CT.},\n  doi        = {10.13140/RG.2.2.13492.91520},\n  url_Paper  = {https://hal.science/hal-01965629v1/file/Cassagne2017%20-%20AFF3CT%20_%20Un%20environnement%20de%20simulation%20pour%20le%20codage%20de%20canal.pdf},\n  url_Link   = {https://hal.science/hal-01965629v1},\n  url_Slides = {https://hal.science/hal-01965629v1/file/Cassagne2017%20-%20AFF3CT%20_%20Un%20environnement%20de%20simulation%20pour%20le%20codage%20de%20canal%20%5Bposter%5D.pdf},\n}\n\n
\n
\n\n\n
\n Dans cet article nous présentons un environnement de simulation de Monte Carlo pour les systèmes de communications numériques. Nous nous focalisons en particulier sur les fonctions associées au codage de canal. Après avoir présenté les enjeux liés à la simulation, nous identifions trois problèmes inhérents à ce type de simulation. Puis nous présentons les principales caractéristiques de l’environnement AFF3CT.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n JAGUAR: A New CFD Code Dedicated to Massively Parallel High-Order LES Computations on Complex Geometry.\n \n \n\n\n \n Cassagne, A.; Boussuge, J.; Puigt, G.; Villedieu, N.; D'Ast, I.; and Genot, A.\n\n\n \n\n\n\n April 2015.\n International Conference on Applied Aerodynamics (AERO).\n\n\n\n
\n\n\n\n \n \n \"JAGUAR: paper\n  \n \n \n \"JAGUAR: link\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@Conference{Cassagne2015,\n  author       = {A. Cassagne and J-F. Boussuge and G. Puigt and N. Villedieu and I. {D'Ast} and A. Genot},\n  title        = {{JAGUAR}: A New {CFD} Code Dedicated to Massively Parallel High-Order {LES} Computations on Complex Geometry},\n  booktitle    = {International Conference on Applied Aerodynamics (AERO)},\n  note         = {International Conference on Applied Aerodynamics (AERO).},\n  year         = {2015},\n  address      = {Toulouse, France},\n  month        = apr,\n  organization = {3AF},\n  abstract     = {LES of industrial flows is associated with geometrical complexity and requires high order schemes to minimize dissipation and dispersion. To tackle these two issues it is necessary to use unstructured grids and High Performance Computing algorithms. In this context, CERFACS initiated two years ago the development of a new CFD code called JAGUAR based on a mathematical framework leading to high-level capability for LES. In this paper, many topics for HPC are introduced and solved in order to obtain the best code performance.},\n  doi          = {10.6084/m9.figshare.12173466.v1},\n  url_Paper    = {https://hal.science/hal-01965640v1/file/Cassagne2015%20-%20JAGUAR%3A%20a%20New%20CFD%20Code%20Dedicated%20to%20Massively%20Parallel%20High-Order%20LES%20Computations%20on%20Complex%20Geometry.pdf},\n  url_Link     = {https://hal.science/hal-01965640v1},\n}\n\n
\n
\n\n\n
\n LES of industrial flows is associated with geometrical complexity and requires high order schemes to minimize dissipation and dispersion. To tackle these two issues it is necessary to use unstructured grids and High Performance Computing algorithms. In this context, CERFACS initiated two years ago the development of a new CFD code called JAGUAR based on a mathematical framework leading to high-level capability for LES. In this paper, many topics for HPC are introduced and solved in order to obtain the best code performance.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n inproceedings\n \n \n (6)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n Real-time and Approximate Iterative Optical Flow Implementation on Low-power Embedded CPUs.\n \n \n\n\n \n Millet, M.; Cassagne, A.; Rambaux, N.; and Lacassagne, L.\n\n\n \n\n\n\n In International Conference on Application-specific Systems, Architectures, and Processors (ASAP), pages 135–138, Porto, Portugal, July 2023. IEEE\n \n\n\n\n
\n\n\n\n \n \n \"Real-time paper\n  \n \n \n \"Real-time link\n  \n \n \n \"Real-time slides\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 5 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@InProceedings{Millet2023,\n  author      = {Millet, Maxime and Cassagne, Adrien and Rambaux, Nicolas and Lacassagne, Lionel},\n  booktitle   = {International Conference on Application-specific Systems, Architectures, and Processors (ASAP)},\n  title       = {Real-time and Approximate Iterative Optical Flow Implementation on Low-power Embedded {CPU}s},\n  year        = {2023},\n  address     = {Porto, Portugal},\n  month       = Jul,\n  pages       = {135--138},\n  publisher   = {IEEE},\n  abstract    = {Optical flow estimation is used in many embedded computer vision applications, and it is known to be computationally intensive. In the literature, many methods exist to estimate optical flow. Thus, the challenge is to find a method that matches the applicative constraints. In an embedded system, a trade-off between power consumption and execution time has to be made to meet both energy and framerate constraints. This work proposes methods to implement an approximate Horn and Schunck optical flow estimation that meets embedded CPUs constraints. This is achieved thanks to architectural optimizations, software optimizations and algorithm tuning. For instance, on the NVIDIA Jetson Nano, and for HD video sequences, the achieved frame latency is 12 ms for 5 Watts. To the best of our knowledge, this is the fastest optical flow implementation on embedded CPUs.},\n  doi         = {10.1109/ASAP57973.2023.00032},\n  hal_id      = {hal-04247806},\n  hal_version = {v1},\n  keywords    = {computer vision, optical flow, SIMD, approximate computing, low power, tradeoffs, embedded systems},\n  url_Paper   = {https://hal.science/hal-04247806v1/file/ASAP_2023.pdf},\n  url_Link    = {https://ieeexplore.ieee.org/document/10265698},\n  url_Slides  = {https://largo.lip6.fr/~lacas/Publications/ASAP23_slides.pdf},\n}\n\n
\n
\n\n\n
\n Optical flow estimation is used in many embedded computer vision applications, and it is known to be computationally intensive. In the literature, many methods exist to estimate optical flow. Thus, the challenge is to find a method that matches the applicative constraints. In an embedded system, a trade-off between power consumption and execution time has to be made to meet both energy and framerate constraints. This work proposes methods to implement an approximate Horn and Schunck optical flow estimation that meets embedded CPUs constraints. This is achieved thanks to architectural optimizations, software optimizations and algorithm tuning. For instance, on the NVIDIA Jetson Nano, and for HD video sequences, the achieved frame latency is 12 ms for 5 Watts. To the best of our knowledge, this is the fastest optical flow implementation on embedded CPUs.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n A Flexible and Portable Real-time DVB-S2 Transceiver using Multicore and SIMD CPUs.\n \n \n\n\n \n Cassagne, A.; Léonardon, M.; Tajan, R.; Leroux, C.; Jégo, C.; Aumage, O.; and Barthou, D.\n\n\n \n\n\n\n In International Symposium on Topics in Coding (ISTC), September 2021. IEEE\n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n \n \"A link\n  \n \n \n \"A slides\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@InProceedings{Cassagne2021,\n  author     = {A. Cassagne and M. L\\'eonardon and R. Tajan and C. Leroux and C. J\\'ego and O. Aumage and D. Barthou},\n  booktitle  = {International Symposium on Topics in Coding (ISTC)},\n  title      = {A Flexible and Portable Real-time {DVB-S2} Transceiver using Multicore and {SIMD} {CPU}s},\n  year       = {2021},\n  month      = sep,\n  publisher  = {IEEE},\n  abstract   = {Software implementation of digital communication systems is more and more used in different contexts. In the case of satellite communication standards, they are an appealing alternative in ground stations. The challenge is to push the performance of these digital communication systems to meet the real time constraints. In this paper, we propose an open source digital communication transceiver that enables to exploit the parallelism of general purpose processors (multicore, SIMD). It is also flexible, supporting several modulation and coding schemes. Finally, it is portable, being able to adapt to the level of parallelism of different CPU architectures (x86 and ARM).},\n  doi        = {10.1109/ISTC49272.2021.9594063},\n  keywords   = {Real-time system, SDR, Multicore CPU, SIMD, DVB-S2 standard, Radio transceiver},\n  url_Paper  = {https://hal.science/hal-03336450v2/file/article.pdf},\n  url_Link   = {https://ieeexplore.ieee.org/document/9594063},\n  url_Slides = {https://lip6.fr/adrien.cassagne/docs/publications/Cassagne2021%20-%20A%20Flexible%20and%20Portable%20Real-time%20DVB-S2%20Transceiver%20using%20Multicore%20and%20SIMD%20CPUs%20%5bvideo%20presentation%5d.mp4},\n}\n\n
\n
\n\n\n
\n Software implementation of digital communication systems is more and more used in different contexts. In the case of satellite communication standards, they are an appealing alternative in ground stations. The challenge is to push the performance of these digital communication systems to meet the real time constraints. In this paper, we propose an open source digital communication transceiver that enables to exploit the parallelism of general purpose processors (multicore, SIMD). It is also flexible, supporting several modulation and coding schemes. Finally, it is portable, being able to adapt to the level of parallelism of different CPU architectures (x86 and ARM).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n MIPP: A Portable C++ SIMD Wrapper and its use for Error Correction Coding in 5G Standard.\n \n \n\n\n \n Cassagne, A.; Aumage, O.; Barthou, D.; Leroux, C.; and Jégo, C.\n\n\n \n\n\n\n In Workshop on Programming Models for SIMD/Vector Processing (WPMVP), Vösendorf/Wien, Austria, February 2018. ACM\n \n\n\n\n
\n\n\n\n \n \n \"MIPP: paper\n  \n \n \n \"MIPP: link\n  \n \n \n \"MIPP: slides\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@InProceedings{Cassagne2018,\n  author     = {A. Cassagne and O. Aumage and D. Barthou and C. Leroux and C. J\\'ego},\n  title      = {{MIPP}: A Portable {C++} {SIMD} Wrapper and its use for Error Correction Coding in {5G} Standard},\n  booktitle  = {Workshop on Programming Models for SIMD/Vector Processing (WPMVP)},\n  year       = {2018},\n  address    = {V\\"osendorf/Wien, Austria},\n  month      = feb,\n  publisher  = {ACM},\n  abstract   = {Error correction code (ECC) processing has so far been performed on dedicated hardware for previous generations of mobile communication standards, to meet latency and bandwidth constraints.\nAs the 5G mobile standard, and its associated channel coding algorithms, are now being specified, modern CPUs are progressing to the point where software channel decoders can viably be contemplated. A key aspect in reaching this transition point is to get the most of CPUs SIMD units on the decoding algorithms being pondered for 5G mobile standards. The nature and diversity of such algorithms requires highly versatile programming tools. 
This paper demonstrates the virtues and versatility of our MIPP SIMD wrapper in implementing a high performance portfolio of key ECC decoding algorithms.},\n  doi        = {10.1145/3178433.3178435},\n  keywords   = {SIMD, wrapper, C++, channel code, SSE, AVX, AVX-512, NEON},\n  url_Paper  = {https://inria.hal.science/hal-01888010v1/file/article.pdf},\n  url_Link   = {https://dl.acm.org/doi/10.1145/3178433.3178435},\n  url_Slides = {https://www.researchgate.net/profile/Adrien-Cassagne/publication/323535568_Slides_MIPP_WPMVP'18/data/5a9a7bb0a6fdcc3cbac95c3b/slides-MIPP-WPMVP18.pdf?origin=publicationDetail&_sg%5B0%5D=dG7dTbdBOP3hghi0bwKrzp5bxeh7Pp8Qx7insNXSHjwRlmvbipNM93MWfHJKgC0xDjzERYbcfUt4VFAR9ts35A._Qbh7bHF29wCDokBGVfMyqBbRHZrxwyGhF2UwyeabzH2FrYskzdXo69qvAFu7Vejt2MlpWXWQ142PxV74-fvHw&_sg%5B1%5D=Jp-HckeIeLRA7eMy0IHXBK01KV-4Q5XzsglZ7FrjcfxY9Xh11iEnZOOo77iBp2LtnNwkBVW8XbgAQmxrYdYdYkXVwNEjqohbJIOGby-LMIrQ._Qbh7bHF29wCDokBGVfMyqBbRHZrxwyGhF2UwyeabzH2FrYskzdXo69qvAFu7Vejt2MlpWXWQ142PxV74-fvHw&_iepl=&_rtd=eyJjb250ZW50SW50ZW50IjoibWFpbkl0ZW0ifQ%3D%3D&_tp=eyJjb250ZXh0Ijp7ImZpcnN0UGFnZSI6Il9kaXJlY3QiLCJwYWdlIjoicHVibGljYXRpb24iLCJwcmV2aW91c1BhZ2UiOiJwcm9maWxlIiwicG9zaXRpb24iOiJwYWdlSGVhZGVyIn19},\n}\n\n
\n
\n\n\n
\n Error correction code (ECC) processing has so far been performed on dedicated hardware for previous generations of mobile communication standards, to meet latency and bandwidth constraints. As the 5G mobile standard, and its associated channel coding algorithms, are now being specified, modern CPUs are progressing to the point where software channel decoders can viably be contemplated. A key aspect in reaching this transition point is to get the most of CPUs SIMD units on the decoding algorithms being pondered for 5G mobile standards. The nature and diversity of such algorithms requires highly versatile programming tools. This paper demonstrates the virtues and versatility of our MIPP SIMD wrapper in implementing a high performance portfolio of key ECC decoding algorithms.\n
\n\n\n
\n\n\n \n\n\n \n\n\n \n\n\n\n\n\n
\n
\n\n
\n
\n  \n mastersthesis\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n Étude et implémentation d’une méthode de calcul pour la simulation numérique sur des architectures modernes.\n \n \n\n\n \n Cassagne, A.\n\n\n \n\n\n\n Master's thesis, EPSI Bordeaux, 2015.\n \n\n\n\n
\n\n\n\n \n \n \"Étude paper\n  \n \n\n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@MastersThesis{Cassagne2015b,\n  author    = {A. Cassagne},\n  title     = {Étude et implémentation d’une méthode de calcul pour la simulation numérique sur des architectures modernes},\n  school    = {EPSI Bordeaux},\n  year      = {2015},\n  abstract  = {In this thesis we will study some modern hardware architectures using a well-known method in digital simulation: the stencil codes. The current HPC context is quite suitable for the arrival of new technologies led by the race to exascale computation. The comparative analysis in this thesis is mainly based on performance (number of floating point operations per second) and on hardware energy efficiency. We will start by formalising and clarifying the stencil method, then we will take a deeper look at three architectures : one standard x86 CPU, one low consumption ARM CPU and one GPU specialized in computations. Thereafter, we will describe some stencil optimisations in order to implement efficient versions of code for each of the architectures. We will explore both well-known methods like Cache Blocking and Register Blocking as well as less known ones such as Dimension Lifted and Transposed and Temporal Blocking. To finish, all these implementations will be tested on a low order stencil using the heat equation discretisation. The analysis will contain three different parts following the three architectures. We will use the Roofline model in order to bound the maximal reachable performance. Then we will study the code internal behavior on CPU and GPU by modifying the problem size. We will also take a look at the weak scalability in caches but only for the CPUs. Lastly, we will present a comparative analysis of energy consumption (also called energy to solution analysis).},\n  url_Paper = {https://largo.lip6.fr/~cassagnea/docs/reports/Cassagne2015b%20-%20Etude%20et%20implementation%20d%20une%20methode%20de%20calcul%20pour%20la%20simulation%20numerique%20sur%20des%20architectures%20modernes.pdf},\n}\n\n
\n
\n\n\n
\n In this thesis we will study some modern hardware architectures using a well-known method in digital simulation: the stencil codes. The current HPC context is quite suitable for the arrival of new technologies led by the race to exascale computation. The comparative analysis in this thesis is mainly based on performance (number of floating point operations per second) and on hardware energy efficiency. We will start by formalising and clarifying the stencil method, then we will take a deeper look at three architectures : one standard x86 CPU, one low consumption ARM CPU and one GPU specialized in computations. Thereafter, we will describe some stencil optimisations in order to implement efficient versions of code for each of the architectures. We will explore both well-known methods like Cache Blocking and Register Blocking as well as less known ones such as Dimension Lifted and Transposed and Temporal Blocking. To finish, all these implementations will be tested on a low order stencil using the heat equation discretisation. The analysis will contain three different parts following the three architectures. We will use the Roofline model in order to bound the maximal reachable performance. Then we will study the code internal behavior on CPU and GPU by modifying the problem size. We will also take a look at the weak scalability in caches but only for the CPUs. Lastly, we will present a comparative analysis of energy consumption (also called energy to solution analysis).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n Implémentation multi GPU de la méthode Spectral Differences pour un code de CFD.\n \n \n\n\n \n Cassagne, A.\n\n\n \n\n\n\n Master's thesis, Université de Bordeaux / EPSI, January 2014.\n \n\n\n\n
\n\n\n\n \n \n \"Implémentation paper\n  \n \n\n \n\n \n \n\n bibtex\n \n\n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@MastersThesis{Cassagne2014,\n  author    = {A. Cassagne},\n  title     = {Implémentation multi {GPU} de la méthode Spectral Differences pour un code de {CFD}},\n  school    = {Universit\\'e de Bordeaux / EPSI},\n  month     = jan,\n  year      = {2014},\n  url_Paper = {https://largo.lip6.fr/~cassagnea/docs/reports/Cassagne2014%20-%20Implementation%20multi%20GPU%20de%20la%20methode%20Spectral%20Differences%20pour%20un%20code%20de%20CFD.pdf},\n}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n misc\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n A Fast Forward Error Correction Toolbox: Seminary.\n \n \n\n\n \n Cassagne, A.; Hartmann, O.; Léonardon, M.; Leroux, C.; and Jégo, C.\n\n\n \n\n\n\n March 2018.\n Presentation at the IMS laboratory, Bordeaux, France.\n\n\n\n
\n\n\n\n \n \n \"A slides\n  \n \n\n \n\n \n \n\n bibtex\n \n\n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@Misc{Cassagne2018a,\n  author     = {A. Cassagne and O. Hartmann and M. L\\'eonardon and C. Leroux and C. J\\'ego},\n  title      = {A Fast Forward Error Correction Toolbox: Seminary},\n  month      = mar,\n  year       = {2018},\n  note       = {Presentation at the IMS laboratory, Bordeaux, France.},\n  keywords   = {AFF3CT},\n  url_Slides = {https://aff3ct.github.io/publications/Cassagne2018a - A Fast Forward Error Correction Toolbox Seminary.pdf},\n}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n phdthesis\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n Optimization and Parallelization Methods for the Software-Defined Radio.\n \n \n\n\n \n Cassagne, A.\n\n\n \n\n\n\n Ph.D. Thesis, Université de Bordeaux, December 2020.\n \n\n\n\n
\n\n\n\n \n \n \"Optimization paper\n  \n \n \n \"Optimization link\n  \n \n \n \"Optimization slides\n  \n \n\n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 13 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@PhdThesis{Cassagne2020,\n  author     = {A. Cassagne},\n  school     = {Universit{\\'e} de Bordeaux},\n  title      = {Optimization and Parallelization Methods for the Software-Defined Radio},\n  year       = {2020},\n  month      = dec,\n  abstract   = {A software-defined radio is a radio communication system where components traditionally implemented in hardware are instead implemented by means of software. With the growing number of complex digital communication standards and the general purpose processors increasing power, it becomes interesting to trade the energy efficiency of the dedicated architectures for the flexibility and the reduced time to market on general purpose processors. Even if the resulting implementation of a signal processing is made on an application-specific integrated circuit, the software version of this processing is necessary to evaluate and verify the correct properties of the functionality. This is generally the role of the simulation. Simulations are often expensive in terms of computational time. To evaluate the global performance of a communication system can require from few days to few weeks. In this context, this thesis proposes to study the most time consuming algorithms in today’s digital communication chains. These algorithms often are the channel decoders located on the receivers. The role of the channel coding is to improve the error resilience of the system. Indeed, errors can occur at the channel level during the transmission between the transmitter and the receiver. Three main channel coding families are then presented: the LDPC codes, the polar codes and the turbo codes. These three code families are used in most of the current digital communication standards like the Wi-Fi, the Ethernet, the 3G, 4G and 5G mobile networks, the digital television, etc. The resulting decoders offer the best compromise between error resistance and decoding speed known to date. 
Each of these families comes with specific decoding algorithms. One of the main challenge of this thesis is to propose optimized software implementations for each of them. Specific efficient implementations are proposed as well as more general optimization strategies. The idea is to extract the generic optimization strategies from a representative subset of decoders. The last part of the thesis focuses on the implementation of a complete digital communication system in software. Thanks to the efficient decoding implementations proposed before, a full transceiver, compatible with the DVB-S2 standard, is implemented. This standard is typically used for broadcasting multimedia contents via satellite. To this purpose, an embedded domain specific language targeting the software-defined radio is introduced. The main objective of this language is to take advantage of the parallel architecture of the current general purpose processors. The results show that the system achieves sufficient throughputs to be deployed in real-world conditions. These contributions have been made in a dynamic of openness, sharing and reusability, it results in an open source library named AFF3CT for A Fast Forward Error Correction Toolbox. Thus, all the results proposed in this thesis can easily be reproduced and extended. 
This philosophy is detailed in a specific chapter of the thesis manuscript.},\n  comment    = {Link to the presentation: https://lip6.fr/adrien.cassagne/docs/publications/Cassagne2020%20-%20Optimization%20and%20Parallelization%20Methods%20for%20the%20Software-Defined%20Radio%20%5bvideo%20phd%20defense%5d.mp4},\n  keywords   = {Software-Defined Radio, Functional Simulation, Error Correcting Codes, Software Implementation, Optimization, Parallelization, Open Source Code},\n  url_Paper  = {https://theses.hal.science/tel-03118420v1/file/CASSAGNE_ADRIEN_2020.pdf},\n  url_Link   = {https://www.theses.fr/2020BORD0231},\n  url_Slides = {https://largo.lip6.fr/~cassagnea/docs/publications/Cassagne2020%20-%20Optimization%20and%20Parallelization%20Methods%20for%20the%20Software-Defined%20Radio%20%5bvideo%20phd%20defense%5d.mp4},\n}\n\n
\n
\n\n\n
\n A software-defined radio is a radio communication system where components traditionally implemented in hardware are instead implemented by means of software. With the growing number of complex digital communication standards and the general purpose processors increasing power, it becomes interesting to trade the energy efficiency of the dedicated architectures for the flexibility and the reduced time to market on general purpose processors. Even if the resulting implementation of a signal processing is made on an application-specific integrated circuit, the software version of this processing is necessary to evaluate and verify the correct properties of the functionality. This is generally the role of the simulation. Simulations are often expensive in terms of computational time. To evaluate the global performance of a communication system can require from few days to few weeks. In this context, this thesis proposes to study the most time consuming algorithms in today’s digital communication chains. These algorithms often are the channel decoders located on the receivers. The role of the channel coding is to improve the error resilience of the system. Indeed, errors can occur at the channel level during the transmission between the transmitter and the receiver. Three main channel coding families are then presented: the LDPC codes, the polar codes and the turbo codes. These three code families are used in most of the current digital communication standards like the Wi-Fi, the Ethernet, the 3G, 4G and 5G mobile networks, the digital television, etc. The resulting decoders offer the best compromise between error resistance and decoding speed known to date. Each of these families comes with specific decoding algorithms. One of the main challenge of this thesis is to propose optimized software implementations for each of them. Specific efficient implementations are proposed as well as more general optimization strategies. 
The idea is to extract the generic optimization strategies from a representative subset of decoders. The last part of the thesis focuses on the implementation of a complete digital communication system in software. Thanks to the efficient decoding implementations proposed before, a full transceiver, compatible with the DVB-S2 standard, is implemented. This standard is typically used for broadcasting multimedia contents via satellite. To this purpose, an embedded domain specific language targeting the software-defined radio is introduced. The main objective of this language is to take advantage of the parallel architecture of the current general purpose processors. The results show that the system achieves sufficient throughputs to be deployed in real-world conditions. These contributions have been made in a dynamic of openness, sharing and reusability, it results in an open source library named AFF3CT for A Fast Forward Error Correction Toolbox. Thus, all the results proposed in this thesis can easily be reproduced and extended. This philosophy is detailed in a specific chapter of the thesis manuscript.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n techreport\n \n \n (3)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n High-order Method for a New Generation of Large Eddy Simulation Solver.\n \n \n\n\n \n Cassagne, A.; Boussuge, J.; and Puigt, G.\n\n\n \n\n\n\n Technical Report PRACE, 2015.\n \n\n\n\n
\n\n\n\n \n \n \"High-order paper\n  \n \n \n \"High-order link\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@TechReport{Cassagne2015a,\n  author      = {A. Cassagne and J-F. Boussuge and G. Puigt},\n  institution = {PRACE},\n  title       = {High-order Method for a New Generation of Large Eddy Simulation Solver},\n  year        = {2015},\n  abstract    = {We enabled hybrid OpenMP/MPI computations for a new generation of CFD code based on a new high-order method (Spectral Difference method) dedicated to Large Eddy Simulation (LES). The code is written in Fortran 90 with MPI library and OpenMP directives for the parallelization. This white-paper is focused on achieving good performances with the OpenMP shared memory model on standard environment (bi-socket nodes and multi-core x86 processors). The goal was to reduce the number of MPI communications by considering MPI communications between nodes and OpenMP approach for all cores on any node. Three different approaches are compared: full MPI, full OpenMP and hybrid OpenMP/MPI. We observed that hybrid and full MPI computations took nearly the same time for a small number of cores.},\n  doi         = {10.13140/RG.2.2.19469.79849},\n  keywords    = {jaguar, cfd, openmp, mpi},\n  url_Paper   = {https://hal.science/hal-01965638v1/file/Cassagne2015a%20-%20High-order%20Method%20for%20a%20New%20Generation%20of%20Large%20Eddy%20Simulation%20Solver.pdf},\n  url_Link    = {https://prace-ri.eu/training-support/technical-documentation/white-papers/application-scalability/},\n}\n\n
\n
\n\n\n
\n We enabled hybrid OpenMP/MPI computations for a new generation of CFD code based on a new high-order method (Spectral Difference method) dedicated to Large Eddy Simulation (LES). The code is written in Fortran 90 with MPI library and OpenMP directives for the parallelization. This white-paper is focused on achieving good performances with the OpenMP shared memory model on standard environment (bi-socket nodes and multi-core x86 processors). The goal was to reduce the number of MPI communications by considering MPI communications between nodes and OpenMP approach for all cores on any node. Three different approaches are compared: full MPI, full OpenMP and hybrid OpenMP/MPI. We observed that hybrid and full MPI computations took nearly the same time for a small number of cores.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n Portage d’un code de lattice QCD sur GPU.\n \n \n\n\n \n Cassagne, A.; Mortier, B.; Pasqualinotto, D.; and Fréchaud, V.\n\n\n \n\n\n\n Technical Report University of Bordeaux, March 2013.\n Projet d’Étude et de Développement (PED). In French.\n\n\n\n
\n\n\n\n \n \n \"Portage paper\n  \n \n\n \n\n \n \n\n bibtex\n \n\n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@TechReport{Cassagne2013b,\n  author      = {A. Cassagne and B. Mortier and D. Pasqualinotto and V. Fr\\'echaud},\n  title       = {Portage d’un code de lattice {QCD} sur {GPU}},\n  institution = {University of Bordeaux},\n  year        = {2013},\n  month       = mar,\n  note        = {Projet d’{\\'E}tude et de D{\\'e}veloppement (PED). In French.},\n  keywords    = {GPU, CUDA, OpenCL, QCD},\n  url_Paper   = {https://largo.lip6.fr/~cassagnea/docs/reports/Cassagne2013b%20-%20Portage%20d%e2%80%99un%20code%20de%20lattice%20QCD%20sur%20GPU.pdf},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n Concurrent Kernel Execution on Graphic Processing Units.\n \n \n\n\n \n Cassagne, A.; George, A.; Lorendeau, B.; Papin, J.; and Rougier, A.\n\n\n \n\n\n\n Technical Report Université de Bordeaux, January 2013.\n Projet d’Étude et de Recherche (PER).\n\n\n\n
\n\n\n\n \n \n \"Concurrent paper\n  \n \n\n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@TechReport{Cassagne2013a,\n  author      = {A. Cassagne and A. George and B. Lorendeau and J-C. Papin and A. Rougier},\n  title       = {Concurrent Kernel Execution on Graphic Processing Units},\n  institution = {Universit\\'e de Bordeaux},\n  year        = {2013},\n  month       = jan,\n  note        = {Projet d’{\\'E}tude et de Recherche (PER).},\n  abstract    = {General Purpose Graphic Processing Unit (GPGPU) are now used in high performance computing (HPC) for their massively parallel computing aspect and capabilities. Those devices integrate hundreds of computing unit (computing core). Usually, such a level of parallelism is used to solve simulation problems (heat transfer, …) because of the numerical representation of simulated environment (matrices). Those GPU can be programmed with specific programming languages like CUDA and OpenCL which provide a standard environment (C/C++ libraries). Programs executed on a GPU (also called kernels) are executed sequentially. However, in order to maximize the usage of GPU resources, some advanced features (developed by NVIDIA) allow programmers to execute severals kernels in parallel on the GPU. Unfortunately, concurrent kernels execution is only possible with CUDA on NVIDIA graphics cards. For other cards, OpenCL does not offer this functionality. That is why researchers from University of Virginia (USA) [2], tried to extend OpenCL standard by allowing execution of an "master kernel" which will launch other kernels. In fact, the "master kernel" is a mix of memory-bound and compute-bound kernels. By doing this, they could evaluate the advantage of this kind of solution. Another group of researchers (from University of George Washington and from University of Arkansas), designed a software environment that allows different threads from the same process to share access to the GPU, which wasn’t possible until the introduction of the "Automatic Context Funneling" [2] capabilities in CUDA 4.0. 
For our PER (Projet d’Etude et de Recherche), we will analyse the benefits and limitations of concurrent kernel execution. We will also determine if parallel kernel execution can be used to avoid the cost of data transfers from the host to the GPU (by starting long computing time kernel before starting data transfers).},\n  keywords    = {Computer science, CUDA, nVidia, nVidia GeForce GTX 660, nVidia Quadro 4000, OpenCL, Performance, GPU, Concurrent kernel},\n  url_Paper   = {https://largo.lip6.fr/~cassagnea/docs/reports/Cassagne2013a%20-%20Concurrent%20Kernel%20Execution%20on%20Graphic%20Processing%20Units.pdf},\n}\n
\n
\n\n\n
\n General Purpose Graphic Processing Unit (GPGPU) are now used in high performance computing (HPC) for their massively parallel computing aspect and capabilities. Those devices integrate hundreds of computing unit (computing core). Usually, such a level of parallelism is used to solve simulation problems (heat transfer, …) because of the numerical representation of simulated environment (matrices). Those GPU can be programmed with specific programming languages like CUDA and OpenCL which provide a standard environment (C/C++ libraries). Programs executed on a GPU (also called kernels) are executed sequentially. However, in order to maximize the usage of GPU resources, some advanced features (developed by NVIDIA) allow programmers to execute severals kernels in parallel on the GPU. Unfortunately, concurrent kernels execution is only possible with CUDA on NVIDIA graphics cards. For other cards, OpenCL does not offer this functionality. That is why researchers from University of Virginia (USA) [2], tried to extend OpenCL standard by allowing execution of an \"master kernel\" which will launch other kernels. In fact, the \"master kernel\" is a mix of memory-bound and compute-bound kernels. By doing this, they could evaluate the advantage of this kind of solution. Another group of researchers (from University of George Washington and from University of Arkansas), designed a software environment that allows different threads from the same process to share access to the GPU, which wasn’t possible until the introduction of the \"Automatic Context Funneling\" [2] capabilities in CUDA 4.0. For our PER (Projet d’Etude et de Recherche), we will analyse the benefits and limitations of concurrent kernel execution. We will also determine if parallel kernel execution can be used to avoid the cost of data transfers from the host to the GPU (by starting long computing time kernel before starting data transfers).\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n unpublished\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n OTAC: Optimal Scheduling for Pipelined and Replicated Task Chains for Software-Defined Radio.\n \n \n\n\n \n Orhan, D.; Lima Pilla, L.; Barthou, D.; Cassagne, A.; Aumage, O.; Tajan, R.; Jégo, C.; and Leroux, C.\n\n\n \n\n\n\n October 2023.\n Preprint.\n\n\n\n
\n\n\n\n \n \n \"OTAC: paper\n  \n \n \n \"OTAC: link\n  \n \n\n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@Unpublished{Orhan2023,\n  author      = {Orhan, Diane and Lima Pilla, La{\\'e}rcio and Barthou, Denis and Cassagne, Adrien and Aumage, Olivier and Tajan, Romain and J{\\'e}go, Christophe and Leroux, Camille},\n  note        = {Preprint.},\n  title       = {{OTAC}: Optimal Scheduling for Pipelined and Replicated Task Chains for Software-Defined Radio},\n  month       = Oct,\n  year        = {2023},\n  abstract    = {Software-Defined Radio (SDR) represents a move from dedicated hardware to software implementations of digital communication standards. This approach offers flexibility, shorter time to market, maintainability, and lower costs, but it requires an optimized distribution of SDR tasks in order to meet performance requirements. In this context, we study the problem of scheduling SDR linear task chains of stateless and stateful tasks. We model this problem as a pipelined workflow scheduling problem based on pipelined and replicated parallelism on homogeneous resources. Based on this model, we propose a scheduling algorithm named OTAC for maximizing throughput while also minimizing the number of allocated hardware resources, and we prove its optimality. We evaluate our approach and compare it to other algorithms in a simulation campaign, and with an actual implementation of the DVB-S2 communication standard on the AFF3CT SDR Domain Specific Language. Our results demonstrate how OTAC finds optimal schedules, leading consistently to better results than other algorithms, or equivalent results with much fewer hardware resources.},\n  hal_id      = {hal-04228117},\n  hal_version = {v1},\n  keywords    = {software-defined radio, pipelined worfklow scheduling, pipelining, replication, optimal algorithm, task chains},\n  url_Paper   = {https://hal.science/hal-04228117/file/otac-optimal-scheduling-hal.pdf},\n  url_Link    = {https://hal.science/hal-04228117},\n}\n\n
\n
\n\n\n
\n Software-Defined Radio (SDR) represents a move from dedicated hardware to software implementations of digital communication standards. This approach offers flexibility, shorter time to market, maintainability, and lower costs, but it requires an optimized distribution of SDR tasks in order to meet performance requirements. In this context, we study the problem of scheduling SDR linear task chains of stateless and stateful tasks. We model this problem as a pipelined workflow scheduling problem based on pipelined and replicated parallelism on homogeneous resources. Based on this model, we propose a scheduling algorithm named OTAC for maximizing throughput while also minimizing the number of allocated hardware resources, and we prove its optimality. We evaluate our approach and compare it to other algorithms in a simulation campaign, and with an actual implementation of the DVB-S2 communication standard on the AFF3CT SDR Domain Specific Language. Our results demonstrate how OTAC finds optimal schedules, leading consistently to better results than other algorithms, or equivalent results with much fewer hardware resources.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n A DSEL for High Throughput and Low Latency Software-Defined Radio on Multicore CPUs.\n \n \n\n\n \n Cassagne, A.; Tajan, R.; Aumage, O.; Barthou, D.; Leroux, C.; and Jégo, C.\n\n\n \n\n\n\n June 2022.\n Preprint.\n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n \n \"A link\n  \n \n\n \n \n doi\n  \n \n\n \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@Unpublished{Cassagne2022,\n  author    = {A. Cassagne and R. Tajan and O. Aumage and D. Barthou and C. Leroux and C. J\\'ego},\n  month     = jun,\n  title     = {A {DSEL} for High Throughput and Low Latency Software-Defined Radio on Multicore {CPU}s},\n  year      = {2022},\n  abstract  = {This article presents a new Domain Specific Embedded Language (DSEL) dedicated to Software-Defined Radio (SDR). From a set of carefully designed components, it enables to build efficient software digital communication systems, able to take advantage of the parallelism of modern processor architectures, in a straightforward and safe manner for the programmer. In particular, proposed DSEL enables the combination of pipelining and sequence duplication techniques to extract both temporal and spatial parallelism from digital communication systems. We leverage the DSEL capabilities on a real use case: a fully digital transceiver for the widely used DVB-S2 standard designed entirely in software. Through evaluation, we show how proposed software DVB-S2 transceiver is able to get the most from modern, high-end multicore CPU targets.},\n  doi       = {10.48550/ARXIV.2206.06147},\n  publisher = {arXiv},\n  url_Paper = {https://arxiv.org/pdf/2206.06147.pdf},\n  url_Link  = {https://arxiv.org/abs/2206.06147},\n  note      = {Preprint.},\n}\n\n
\n
\n\n\n
\n This article presents a new Domain Specific Embedded Language (DSEL) dedicated to Software-Defined Radio (SDR). From a set of carefully designed components, it enables to build efficient software digital communication systems, able to take advantage of the parallelism of modern processor architectures, in a straightforward and safe manner for the programmer. In particular, proposed DSEL enables the combination of pipelining and sequence duplication techniques to extract both temporal and spatial parallelism from digital communication systems. We leverage the DSEL capabilities on a real use case: a fully digital transceiver for the widely used DVB-S2 standard designed entirely in software. Through evaluation, we show how proposed software DVB-S2 transceiver is able to get the most from modern, high-end multicore CPU targets.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n\n\n\n
\n\n\n \n\n \n \n \n \n\n
\n"}; document.write(bibbase_data.data);